# Immigration and the top 1%

# characteristics of joiners, leavers, and stayers


## calculate distribution of YOA by joiner or stayer

# base year of the churn file to analyse (used in input and output file names)
base_year <- 2017

# top-share labels exactly as they appear in the churn file names
ts_vec <- c("1", "01")


# loop over top shares: for each label in ts_vec, build the distribution of
# year of arrival (YOA) by joiner/stayer group, save a bar chart as PDF and
# return the table; map_dfr row-binds the per-share tables into `tab`.
# Relies on `yoa` and `base_year` being defined before this point.
tab <- map_dfr(ts_vec, function(ts_string) {
  # top share as numeric; resolved first so that an unsupported label fails
  # immediately with a clear message instead of after the graph was written
  ts_num <- switch(
    ts_string,
    "1" = .01,
    "01" = .001,
    "001" = .0001,
    stop("unsupported top share label: ", ts_string, call. = FALSE)
  )

  # upload churn data as a tibble
  data <- as_tibble(
    read_fst(
      paste0("output/top", ts_string, "_base_year", base_year, ".fst")
    )
  )

  # identifiers present in the churn data, NAs removed
  utrlist <- data$anon_utr
  utrlist <- utrlist[!is.na(utrlist)]

  ninolist <- data$nino_anon
  ninolist <- ninolist[!is.na(ninolist)]

  # (no ID list is built: there is no ID in the YOA dataset to match on)

  # # # #

  ## merge yoa information

  # subset yoa data (easier to merge): migrants matching on either identifier
  small_yoa <- yoa %>%
    filter(
      migrant_comb == 1,
      anon_utr %in% utrlist | nino_anon %in% ninolist
    ) %>%
    select(anon_utr, nino_anon, starts_with("arrival")) %>%
    as_tibble()

  # merge
  # NOTE(review): the join needs BOTH keys to agree although the subset above
  # matches on EITHER key, and any duplicate key pairs in yoa would duplicate
  # churn rows - confirm yoa is unique on (anon_utr, nino_anon)
  data <- left_join(data, small_yoa, by = c("anon_utr", "nino_anon"))

  # combine arrival variables into one
  # case_when uses the first matching condition, so the "early is missing"
  # fallback has to come before the general "arrival_mws < 2000" rule;
  # placed after it, the fallback could never be reached
  data <- data %>%
    mutate(
      arrival_new = case_when(
        !is.na(arrival_mws) & arrival_mws >= 2000 ~ arrival_mws,
        is.na(arrival_ninopf_early) & arrival_mws < 2000 ~ arrival_mws,
        is.na(arrival_mws) ~ arrival_ninopf_early,
        arrival_mws < 2000 ~ arrival_ninopf_early
      )
    )

  # # # #

  ## define groups: joiner and stayer (everything else is NA)
  data <- data %>%
    mutate(
      group_persist = case_when(
        year_base == 1 & year_minus_1 == 1 ~ "stayer",
        year_base == 1 & year_minus_1 == 0 ~ "joiner",
        TRUE ~ NA_character_
      )
    )

  # filter based on arrival year (also drops rows without an arrival year)
  dgraph <- data %>%
    mutate(arrival_var = arrival_new) %>%
    filter(arrival_var <= base_year)

  # recode YOA for stayers who have year of arrival == base year
  # small mistakes, but otherwise creates primary disclosure issues
  # %in% is used rather than == so that a missing group_persist gives FALSE
  # (keep the value) instead of NA (which would blank the arrival year)
  dgraph <- dgraph %>%
    mutate(
      arrival_var = if_else(
        group_persist %in% "stayer" & arrival_var == base_year,
        arrival_var - 1,
        arrival_var
      )
    )

  # combine years of arrival before 1990 for Top0.1pc
  if (ts_string == "01") {
    # group by 2 for years before 1990
    breaks_vec <- c(0, seq(1976, 1990, 2), 1991:base_year)
  } else {
    # no grouping
    breaks_vec <- c(0, 1975:base_year)
  }

  # arrival var in groups (integer bin codes, right-closed intervals)
  dgraph <- dgraph %>%
    mutate(
      arrival_grouped = cut(arrival_var, breaks = breaks_vec, labels = FALSE)
    )

  # relabel each bin by the latest arrival year observed in it
  dgraph <- dgraph %>%
    group_by(arrival_grouped) %>%
    mutate(arrival_grouped_max = max(arrival_var)) %>%
    ungroup()

  # counts per arrival bin and group
  dgraph <- dgraph %>%
    group_by(arrival_grouped_max, group_persist) %>%
    summarise(n = n()) %>%
    ungroup()

  # totals per group
  tot <- dgraph %>%
    group_by(group_persist) %>%
    summarise(group_tot = sum(n, na.rm = TRUE)) %>%
    ungroup()

  # merge total back
  dgraph <- left_join(dgraph, tot, by = "group_persist")

  # calculate proportion within group
  dgraph <- dgraph %>%
    mutate(prop = n / group_tot)

  # plot; the added empty stayer row for the base year only affects the graph
  # (stayers were recoded away from the base year above), not the returned table
  yoa_plot <- dgraph %>%
    add_row(
      arrival_grouped_max = base_year,
      group_persist = "stayer",
      n = NA
    ) %>%
    ggplot(aes(arrival_grouped_max, fill = group_persist)) +
    geom_col(position = "dodge", aes(y = prop)) +
    scale_fill_manual(values = c("navy", "dodgerblue")) +
    xlab("Arrival Year") +
    ylab("Proportion") +
    theme_minimal()

  # save graph; the plot is passed explicitly rather than via last_plot()
  file_name <- paste0("YOA_Top", ts_string, "_base_year", base_year, ".pdf")
  ggsave(
    filename = file_name,
    plot = yoa_plot,
    path = "output/",
    dpi = "retina",
    height = 9,
    width = 16,
    units = "cm"
  )

  # add top share variable and return the (ungrouped) table
  dgraph %>%
    mutate(ts = ts_num)
})



# write the stacked table for this base year to output/YOA_base_year<year>.csv
file_name <- paste0("output/", "YOA", "_base_year", base_year, ".csv")
write_csv(tab, file_name)



# # # #

## combine csv files for all base years

# list of per-base-year YOA tables to combine
# output/ also holds the .fst churn inputs and the .pdf graphs, so the listing
# is restricted to files named YOA_base_year<4 digits>.csv; this also leaves
# out a previously combined YOA_allyears.csv
file_list <- list.files(
  "output/",
  pattern = "^YOA_base_year[0-9]{4}\\.csv$",
  full.names = TRUE
)

# read every listed file, tag its rows with the base year taken from the
# file name, and stack the results into one table
combined_tab <- map_dfr(file_list, function(file_name) {
  # "year" followed by four digits in the path, with the "year" prefix removed
  # (kept as a character string)
  year_label <- gsub("year", "", str_extract(file_name, "year[:digit:]{4}"))

  # upload and add the base year variable
  read_csv(file_name) %>%
    mutate(base_year = year_label)
})

# smallest cell count across all base years (shown when run at top level)
min(combined_tab$n)

# write combined table
write_csv(combined_tab, file.path("output", "YOA_allyears.csv"))

